#Part I

Data Wrangling

In the hackathon, a project was proposed to collect data on student video watching. A sample of this data is available in the file video-data.csv.

stid = student id; year = year student watched video; participation = whether or not the student opened the video; watch.time = how long the student watched the video for; confusion.points = how many times a student rewatched a section of a video; key.points = how many times a student skipped or increased the speed of a video

#Load the package(s) you just installed
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the sample of student video-watching records (first row = column names)
D1 <- read.csv(file = "video-data.csv")

# Keep only the rows recorded for the year 2018
D2 <- D1 %>% filter(year == 2018)

Histograms

# Histogram of the 2018 watch times using hist()'s default binning

hist(D2$watch.time)

# Same data with breaks = 100. A single number is only a suggested bin count;
# hist() rounds it to "pretty" cut points. Does the finer view change the impression?

hist(D2$watch.time, breaks = 100)

# Limit the visible y-axis to 0-10 so that low-frequency bins become readable;
# bars taller than 10 run off the top of the plot

hist(D2$watch.time, breaks = 100, ylim = c(0,10))

# Default y-axis again, with explicit unequal-width bins 0-5, 5-20, 20-25, 25-35.
# With unequal widths hist() plots densities rather than counts by default, and
# it stops with an error if any watch time lies outside 0-35.

hist(D2$watch.time, breaks = c(0,5,20,25,35))

Plots

# Scatter plot over all years (D1, not the 2018 subset D2):
# confusion points on the x-axis, watch time on the y-axis

plot(D1$confusion.points, D1$watch.time)

# Two example vectors of equal length; values are paired by position
x <- c(1, 3, 2, 7, 6, 4, 4)
y <- c(2, 4, 2, 3, 2, 4, 3)

# Cross-tabulate the pairs: rows are the distinct x values, columns the distinct y values
table1 <- table(x = x, y = y)

# Draw the contingency table as a stacked bar chart (one bar per y value)
barplot(table1)

# Average the key points within each year, then draw the yearly means as a dashed line

D3 <- D1 %>%
  group_by(year) %>%
  summarise(mean_key = mean(key.points))
## `summarise()` ungrouping output (override with `.groups` argument)
plot(D3$year, D3$mean_key, lty = "dashed", type = "l")

# Box plot of watch time for three selected students (ids 4, 20 and 22)
D4 <- D1 %>% filter(stid %in% c(4, 20, 22))
# droplevels() discards any factor levels that no longer occur after the filter
D4 <- droplevels(D4)
boxplot(watch.time ~ stid, data = D4, xlab = "Student", ylab = "Watch Time")

## Pairs

# Pick columns 2, 5, 6 and 7 by position using [row, column] indexing
D5 <- D1[, c(2, 5:7)]
# Scatter-plot matrix: one panel for every pair of the selected columns
pairs(D5)

## Part II

  1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature.
# Draw 100 raw scores clustered around 75 (sd 15). rnorm() is unbounded, so
# individual draws can land outside the required 1-100 range.
score <- rnorm(100, mean = 75, sd = 15)
hist(score, breaks = 30)

# Round to whole points and clamp into 1-100 in one vectorised step. Clamping
# (instead of dropping draws above 100 and padding with 100s) always keeps
# exactly 100 rows and also enforces the lower bound of 1.
S3 <- data.frame(score = pmin(pmax(round(score, 0), 1), 100))

# Randomly assign each student to one of the four interest groups
interest <- c("sport", "music", "nature", "literature")
S3$interest <- sample(interest, nrow(S3), replace = TRUE)

# Sequential student ids 1, 2, ..., 100
S3$stid <- seq(1, nrow(S3), 1)
  1. Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data.
# breaks = 10 is only a suggested bin count; hist() rounds it to "pretty" cut points
hist(S3$score,breaks = 10)

  1. Create a new variable that groups the scores according to the breaks in your histogram.
# Re-use the exact cut points hist() chooses for breaks = 10 (computed without
# drawing), so every group corresponds to one bar of the histogram. A bare
# breaks = 10 in cut() would instead split the range into 10 equal-width
# intervals, which generally do not line up with hist()'s "pretty" breaks.
score.breaks <- hist(S3$score, breaks = 10, plot = FALSE)$breaks

# One letter per bin (a, b, c, ...), lowest bin first
label <- letters[seq_len(length(score.breaks) - 1)]

# include.lowest = TRUE mirrors hist(), which counts the minimum in the first bin
S3$breaks <- cut(S3$score, breaks = score.breaks, labels = label, include.lowest = TRUE)
  1. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a palette and assign it to the groups in your data on the histogram.
library(RColorBrewer)
# Preview every palette that ships with RColorBrewer
display.brewer.all()

#The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging.
# Find the bins hist() will draw for breaks = 10 (computed without plotting)
score.bins <- hist(S3$score, breaks = 10, plot = FALSE)$breaks
n.bins <- length(score.bins) - 1

# Build exactly one colour per bin. brewer.pal() caps "BrBG" at 11 colours, so
# the 10-colour palette is stretched with colorRampPalette() to the bin count.
bin.palette <- colorRampPalette(brewer.pal(10, "BrBG"))(n.bins)

# Give each student the colour of the bin their score falls in, rather than
# recycling a 10-colour vector down the 100 rows in row order
S3$colors <- bin.palette[cut(S3$score, breaks = score.bins, labels = FALSE, include.lowest = TRUE)]

# Colour the bars with the per-bin palette
hist(S3$score, breaks = 10, col = bin.palette)

  1. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color.
# One qualitative RColorBrewer colour per interest group
interest.col <- brewer.pal(n = 4, name = "Set3")

# Score distribution per interest group, one box (and colour) per group
boxplot(score ~ interest, data = S3, col = interest.col)

  1. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25.
# Simulated login counts: 100 independent draws (with replacement) from the integers 1-25
S3$login <- sample.int(25, size = 100, replace = TRUE)
  1. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group.
# Colour each dot by the student's interest group (not by the score-bin colours
# in S3$colors). factor() orders the groups alphabetically, so interest.col[1]
# goes to the first level, interest.col[2] to the second, and so on.
interest.group <- factor(S3$interest)
plot(S3$login, S3$score, col = interest.col[interest.group], main = "Student Logins vs. Scores")
# Key that maps the dot colours back to the interest groups
legend("bottomright", legend = levels(interest.group), col = interest.col, pch = 1)

  1. R contains several inbuilt data sets, one of these is called AirPassengers. Plot a line graph of the airline passengers over time using this data set.
# Copy of the series as a data frame (not used by the plot below)
AP <- data.frame(AirPassengers)

# AirPassengers is a time series (ts) object, so plot() draws it as a line over time
plot(AirPassengers)

  1. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on?
# Work on a copy of the built-in iris data
Iris <- data.frame(iris)

# Scatter-plot matrix of every pair of columns. plot() on a data frame with
# more than two columns produces this same pairs() display, so a single call
# is enough. The Species factor is shown via its integer level codes.
pairs(Iris)

Sepal.Length vs Petal.Length; Sepal.Length vs Petal.Width; Sepal.Width vs Petal.Width; Petal.Length vs Petal.Width

Part III - Analyzing Swirl

Data

In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository.

Instructions

  1. Insert a new code block
  2. Create a data frame from the swirl-data.csv file called DF1
# Load the Swirl activity log (first row of the file = column names)
DF1 <- read.csv(file = "swirl-data.csv")

The variables are:

course_name - the name of the R course the student attempted
lesson_name - the lesson name
question_number - the question number attempted
correct - whether the question was answered correctly
attempt - how many times the student attempted the question
skipped - whether the student skipped the question
datetime - the date and time the student attempted the question
hash - anonymized student ID

  1. Create a new data frame that only includes the variables hash, lesson_name and attempt called DF2
# Keep only the student id, the lesson and the attempt count
DF2 <- DF1 %>% select(hash, lesson_name, attempt)
  1. Use the group_by function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3
# Total attempts per student (hash) within each lesson
DF3 <- DF2 %>%
  group_by(hash, lesson_name) %>%
  summarise(TotalAttempts = sum(attempt))
## `summarise()` regrouping output by 'hash' (override with `.groups` argument)
  1. On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names #my answer

  2. Convert DF3 to this format

# Reshape DF3 from long to wide: one row per student (hash), one column per
# lesson, each cell holding that student's total attempts for the lesson.
# Rows with a missing or blank lesson name are dropped first because they
# cannot be turned into a column name.
DF3_wide <- DF3 %>%
  ungroup() %>%
  filter(!is.na(lesson_name), lesson_name != "") %>%
  pivot_wider(names_from = lesson_name, values_from = TotalAttempts)

library(ggplot2)

# Visual check of the same data in long form: one point per student/lesson
# combination, coloured by the total number of attempts
ggplot(DF3, aes(x = lesson_name, y = hash, color = TotalAttempts)) +
  geom_point() +
  expand_limits(y = 0)

  1. Create a new data frame from DF1 called DF4 that only includes the variables hash, lesson_name and correct
# Keep only the student id, the lesson and whether the answer was correct
DF4 <- DF1 %>% select(hash, lesson_name, correct)
  1. Convert the correct variable so that TRUE is coded as the number 1 and FALSE is coded as 0
# Recode correctness as numbers: TRUE -> 1, anything else -> 0 (NA stays NA)
DF4$correct <- as.numeric(DF4$correct == "TRUE")
  1. Create a new data frame called DF5 that provides a mean score for each student on each course
# Mean score per student per course, where the score is the share of questions
# answered correctly (TRUE = 1, FALSE = 0). Averaging 'attempt' would give the
# mean number of tries per question, not a score.
DF5 <- DF1 %>%
  mutate(correct = ifelse(correct == "TRUE", 1, 0)) %>%
  group_by(hash, course_name) %>%
  summarise(meanscore = mean(correct, na.rm = TRUE))
## `summarise()` regrouping output by 'hash' (override with `.groups` argument)
  1. Extra credit Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day
# Work on just the two columns needed for the daily average
DF <- select(DF1, correct, datetime)
# Recode correctness as 1/0 with the same comparison used for DF4
DF$correct <- ifelse(DF$correct == "TRUE", 1, 0)
# Assumes datetime is numeric seconds since the Unix epoch (as.POSIXct() only
# uses 'origin' for numeric input) -- TODO confirm against the CSV.
# The origin is written as a plain, well-formed date instead of relying on
# lenient parsing of "00:00.00 UTC".
DF$datetime <- as.POSIXct(DF$datetime, origin = "1970-01-01")
# Collapse each timestamp to its calendar day (month-day-year). Days are cut in
# the session's local time zone; pass tz = to as.POSIXct() above to pin one.
DF$datetime <- strftime(DF$datetime, format = "%m-%d-%y")
# Share of correct answers per day, ignoring rows with unknown correctness
DF6 <- DF %>%
  group_by(datetime) %>%
  summarise(average = mean(correct, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)